qui {
noisily {
	/*************************************************/
	/* Step 1.4. Collect population and surface data */
	/*************************************************/
/*

- To gain panel data on surface data (in square km.), our strategy is the following:
	- Collect surface data (in square km.) from Lake & O'Mahony (2007) and the World Development Indicators (2016)
	- Linearly interpolate missing values in the first data source (interbellum years left open by LO)
	- Select a base series, the most complete dataset (in this case: LO)
	- Use the WDI data to approximate missing values in the base series.

		* IF THE MISSING VALUES ARE RELATED TO A COUNTRY ALREADY APPEARING IN LO:
		-> Use the evolution in surface area in this selected dataset to complete missings in the LO data, eg.
			surface_it = (surface_selected_it/surface_selected_it-1)*surface_LO_it-1 if surface_it is missing
			surface_it = (surface_selected_it/surface_selected_it+1)*surface_LO_it+1 if surface_it is missing
			
		IF THE MISSING VALUES ARE RELATED TO A COUNTRY NOT COVERED BY LO:
		-> approximate by using surface data in WDI 
			surface_it = surface_WDI_it if surface_it is missing
			
- To gain panel data on the evolution of population, our strategy is the following:
	- Collect population data from various sources 
	- Linearly interpolate missing values in all data sources
	- Select a base series, the most complete dataset (in this case: COW)
	- From the additional sources, select the one for which the overlapping population paths are most strongly correlated with the COW data.
	- Use this data to approximate missing values in the base series.
	
		* IF THE MISSING VALUES ARE RELATED TO A COUNTRY ALREADY APPEARING IN COW:
		-> Use the evolution in population in this selected dataset to complete missings in the COW data, eg.
			pop_it = (pop_selected_it/pop_selected_it-1)*pop_COW_it-1 if pop_it is missing
			pop_it = (pop_selected_it/pop_selected_it+1)*pop_COW_it+1 if pop_it is missing
			
		IF THE MISSING VALUES ARE RELATED TO A COUNTRY NOT COVERED BY COW:
		-> approximate by using population in the US as a benchmark
			pop_it = (pop_selected_it/pop_selected_USt)*pop_COW_USt if pop_it is missing
			
	- Proceed by using the second most strongly correlated source and fill in as many missing values as possible
	- Result = composite index of population
*/
}

/* 1.4.1. Load data */
* Leave the current folder and enter the sibling folder with the intermediate datasets
cd ..
cd ".\3. Intermediary results"

* One pass per population source: attach its pop_ series to the panel held in memory,
* matching on year and on the source-specific country code
foreach src in barlee CLIO COW_NC Madison Madison2018 PWT56 PWT8 WDI {
		merge 1:1 year cntrycode_`src' using `src', keepusing(pop_`src')
		if "`src'" == "WDI" {
			* WDI is merged a second time for its surface series, so the first
			* merge has to be closed (using-only rows and marker dropped) beforehand
			drop if _merge == 2
			drop _merge
			merge 1:1 year cntrycode_`src' using `src', keepusing(surface_`src')
			}
		* Make the population series numeric; with force, non-numeric entries become missing
		destring pop_`src', replace force
		* Drop rows present only in the source file, then the merge marker
		drop if _merge == 2
		drop _merge
		* Drop rows that carry no country name
		drop if cntry == ""
		}

* Surface series from the LM file, merged in the same way
merge 1:1 year cntrycode_LM using LM, keepusing(surface_LM)
drop if _merge == 2
drop _merge

 noisily {
* 1.  Surface Area of Separate Countries
}

/* 1.4.2. Generate dependent variable, use LM data as base series (most observations) */

	* Step A: close gaps in the LM series by linear interpolation over years, country by country
	* (per the original note, these gaps are primarily interbellum years left open by LO)

		* Observed LM values are flagged 0 before any gap is filled
		gen interpolated_LM = 0 if surface_LM != .
		* Interpolate into a scratch variable and copy it into the gaps only
		tempvar lm_filled
		bysort cntrycode: ipolate surface_LM year, gen(`lm_filled')
		replace surface_LM = `lm_filled' if surface_LM == .
		drop `lm_filled'
		* Values that exist now but were not flagged above are the interpolated ones
		replace interpolated_LM = 1 if interpolated_LM == . & surface_LM != .
		* The WDI series is not interpolated in this file, so its flag is 0 throughout
		gen interpolated_WDI = 0
		
	* Step B: start the composite series from the (gap-filled) LM data
		gen surface = surface_LM
		
	* Step C: bookkeeping variables, set only where the composite series has a value
		* Source code: 1 = LM, 2 = WDI
		gen source_surface = 1 if surface != . 	// 1 = LM, 2 = WDI
		* The base series correlates perfectly with itself
		scalar correlation_surface_1 = 1
		
		* Interpolation flag inherited from the LM series
		gen interpolated_surface = interpolated_LM if surface != .
		
		* No base-series value is extrapolated
		gen extrapolated_surface = 0  if surface != .
		
		* No base-series value is polynomial-predicted
		gen polynomialpredicted_surface = 0  if surface != .

/* 1.4.3. Complete by using other data sources */
* Declare the panel so that L. and F. refer to the previous and next year of the same country
xtset cntrycode year
	
foreach aux in "WDI"  {	
	* Correlation between base and auxiliary series, stored rounded to two decimals
	corr surface_LM surface_`aux' if year >= $startyear & independence_years != .
	scalar correlation_surface_2 = round(`r(rho)',.01)
	
		* Chain forward: apply the year-on-year growth of the auxiliary series to the
		* previous value of the composite series. Each pass can only reach one year
		* further, so repeat until a pass adds nothing to the counted sample.
		local gain = 999
		while `gain' != 0 {
			sum surface if independence_years != . & year >= $startyear
			local before = r(N)
			replace surface = (1+(surface_`aux'-L.surface_`aux')/L.surface_`aux')*L.surface if surface == .
			sum surface if independence_years != . & year >= $startyear
			local gain = r(N) - `before'
			}
									
		* Chain backward: same idea, anchored on the following year
		local gain = 999
		while `gain' != 0 {
			sum surface if independence_years != . & year >= $startyear
			local before = r(N)
			replace surface = (1+(surface_`aux'-F.surface_`aux')/F.surface_`aux')*F.surface if surface == .
			sum surface if independence_years != . & year >= $startyear
			local gain = r(N) - `before'
			}
			
		* Values obtained by chaining get source code 992, i.e. 990 + source number 2 (WDI),
		* the same 990 + source convention the population section uses for chained values
		replace source_surface = 992 if surface != . & source_surface == .
		
		* Chained values count as extrapolated
		replace extrapolated_surface = 1  if surface != . & extrapolated_surface == .
		
		* Where the composite series is still empty, take the auxiliary level as it is
		replace surface = surface_`aux' if surface == .
		
		* Values taken directly from the auxiliary series get source code 2 (WDI)
		replace source_surface = 2 if surface != . & source_surface == .
		
		* Directly copied values are not extrapolated
		replace extrapolated_surface = 0 if surface != . & extrapolated_surface == .
		}
	
	* Remaining gaps: carry the neighbouring value over, one year forward and one year
	* backward per pass, for a fixed 200 passes
	forval pass=1/200 {
		by cntrycode: replace surface = L.surface if surface == .
		by cntrycode: replace surface = F.surface if surface == .
		}
	
	* Carried-over values get source code 0
	* (the original note on this line read 990 = interpolated, but the value assigned is 0)
	replace source_surface = 0 if surface != . & source_surface == .
	
	* Interpolation flag for values that came from WDI; carried-over values (source 0) are left unflagged
	replace interpolated_surface = interpolated_WDI  if surface != . & interpolated_surface == . & source_surface != 0
	
	* Whatever has a value but no extrapolation flag yet is marked as extrapolated
	replace extrapolated_surface = 1  if surface != . & extrapolated_surface == .
		
	* No value in the surface series is polynomial-predicted
	replace polynomialpredicted_surface = 0  if surface != . & polynomialpredicted_surface == .
		
/* 1.4.4. Compute log surface for later usage */

* Natural log of the composite surface series
generate lsurface = ln(surface)

/* 1.4.5. Label relevant variables */

label variable surface "Surface area (sq. km.)"
label variable lsurface "Log surface area (sq. km.)"

/* 1.4.6. Drop raw data */

* Removes every variable whose name begins with surface_ (the per-source series);
* surface and lsurface do not match the pattern and are kept
drop surface_*
		
 noisily {
* 2. Identify mother countries
}

/* 1.4.7. Add COW independence dates */

	* Attach the COW independence file on COW_SM country code and year
	* NOTE(review): unlike the other merges in this file, rows found only in the
	* using file are kept here - confirm that this is intended
	merge 1:1 cntrycode_COW_SM year using COW_independence
	drop _merge
	
/* 1.4.8. Add data on colonial contiguency */

	* Write two lookup files that map a COW country code to a country name. They hold
	* the same rows (one per cntrycode) and differ only in variable names, so that each
	* can serve as the using file of one of the m:1 merges further down:
	*   cntrys   : mother          mname
	*   cntrys_2 : mother_country  mother_country_name
	preserve
	duplicates drop cntrycode, force
	keep cntry cntrycode_COW
	rename cntrycode_COW mother
	rename cntry mname
	save cntrys, replace
	* Same rows, second set of names
	rename mother mother_country
	rename mname mother_country_name
	save cntrys_2, replace
	restore
			
	* Load the COW territorial-change file; the panel in memory is set aside until restore
	preserve
	use COW_TC, clear
			
	* Define necessary variables
	keep if portion == 1 // Only keep cases when an entire entity changed ownership
	* At most one change per entity and year (force keeps an arbitrary one of the duplicates)
	duplicates drop cntrycode_COW_TC year, force
	gen mother = loser
	gen new_owner = gainer
	keep cntrycode_COW_TC mother new_owner year entry exit indep
	keep if indep == 1 	// Only keep territorial changes resulting from secession
	xtset cntrycode_COW_TC year
	sort cntrycode_COW_TC year
	
	* Correct Senegalese mother country from Mali to France
	replace mother = 220	 if cntrycode_COW_TC == 433 & year == 1960
	
	* Add name of mother country from the lookup file; unmatched lookup rows are dropped
	merge m:1 mother using cntrys
	drop if _merge == 2
	drop _merge
	* Codes without an entry in the lookup file get a descriptive name
	replace mname = "Mandated Territory (UN)" if mother == 1 
	replace mname = "Missing/irrelevant" if mother == -9
	
	* Add independence spring information: number the secessions of each entity
	* chronologically. The sort order within an entity must be stated explicitly
	* with (year); a plain bysort on the entity alone leaves the within-entity
	* order undefined, so _n would not be guaranteed to follow the years.
	bysort cntrycode_COW_TC (year): gen independence_spring_COW = _n
	
	* Save necessary information
	keep cntrycode_COW_TC independence_spring_COW mother mname
	save TC, replace
	restore
	
	* Attach mother code and name to the panel, matching entity and secession number
	merge m:1 cntrycode_COW_TC independence_spring_COW using TC
	drop if _merge == 2
	drop _merge

	* Correct post-independence information: from independence onwards a country is
	* its own mother. Stata treats a missing value as larger than any number, so a
	* bare >= 0 test would also relabel country-years without independence
	* information. The explicit non-missing guard is the same one used in the
	* population-share step later in this file.
	replace mother = cntrycode_COW_TC if independence_years >= 0 & independence_years != .
	replace mname = cntry if independence_years >= 0 & independence_years != .
		
	* Identify mother country: longest colonizer
	* For every COW code from 1 to 1000 that occurs in the data, take the most frequent
	* mother code other than the country itself and assign it to all rows of that country.
	* egen mode() returns missing when several values tie, unless minmode or maxmode
	* is specified, so tied cases end up without a mother country.
	gen mother_country = .
	foreach code of numlist 1/1000 {
		di `code', _continue
		sum year if cntrycode_COW == `code'
		if r(N) > 0 {
			egen mother_country_`code' = mode(mother) if mother != `code' & cntrycode_COW == `code'
			* The mode is constant within the country, so its mean is the mode itself;
			* r(mean) is missing when no qualifying row exists
			sum mother_country_`code'
			replace mother_country = r(mean) if cntrycode_COW == `code'
			drop mother_country_`code'
			}
		}
			
	* Include names of mother countries, then remove the lookup file from disk
	merge m:1 mother_country using cntrys_2
	drop if _merge == 2
	drop _merge
	erase cntrys_2.dta
	replace mother_country_name = "Russia" if cntry == "Russia"
	
	* Label variables
	label var mother_country "Unique country considered to be the mother country prior to independence"
	label var mother "COW country code of colonial power (identical to cntrycode_COW after independence)"
	label var mname "Colonial power"
				
	* Final names under which the two variables are kept
	rename mname mother_cntry
	rename mother_country cntrycode_mother
	
/* 1.4.9. Drop unnecessary data */

drop statenme independence_years_COW independence_spring_COW mother


noisily {
* 3. Population data
}

/* 1.4.10. Express total population in absolute numbers */
* These six series are multiplied by 1000; pop_WDI and pop_PWT8 are left unscaled
foreach src in Madison Madison2018 COW_NC PWT56 barlee CLIO {
	replace pop_`src' = 1000*pop_`src'
	}

/* 1.4.11. Label variables, indicating their sources */
label var pop_WDI "Population (source: WDI)"
label var pop_COW_NC "Population (source: COW_NC)"
label var pop_Madison "Population (source: Madison)"
* pop_Madison2018 was the only source series left without a label
label var pop_Madison2018 "Population (source: Madison2018)"
label var pop_barlee "Population (source: barlee)"
label var pop_PWT56 "Population (source: PWT 5.6)"
label var pop_PWT8 "Population (source: PWT 8.0)"
label var pop_CLIO "Population (source: CLIO)"
 
/* 1.4.12. Linearly interpolate missing values. */
* by cntry: below requires the data to be sorted by country
sort cntry year

* For every population source: keep a raw copy, fill gaps by linear interpolation over
* years within each country, and record which values were interpolated.
* (The source counters formerly computed in this loop were never used and are removed;
* the next section initialises its own counter.)
foreach name in "COW_NC" "Madison2018" "WDI" "Madison" "PWT8" "CLIO" "PWT56" "barlee" {
	* 0 = value observed in the raw data
	gen interpolated_pop_`name' = 0 if pop_`name' != .
	* Keep raw data, to track number of linearly interpolated contributions
	gen pop_`name'_raw = pop_`name'
	* Interpolated copy of the series, which then replaces the original
	by cntry: ipolate pop_`name' year, generate(pop_`name'1)
	replace pop_`name' = pop_`name'1
	drop pop_`name'1
	* 1 = value exists only after interpolation
	replace interpolated_pop_`name' = 1 if interpolated_pop_`name' == . & pop_`name' != .
	}
	
/* 1.4.13. Generate dependent variable, use COW data as base series (most observations) */
gen pop = pop_COW_NC
* Source codes for values taken directly from a source:
* 1 = COW_NC, 2 = Madison2018, 3 = WDI, 4 = Madison, 5 = PWT8, 6 = CLIO, 7 = PWT56, 8 = barlee
* Values chained from a source through growth rates get 990 + that number.
* NOTE(review): barlee (8) is not in the fill loop below, so code 8 is never assigned - confirm
gen source_pop = 1 if pop != .

* The base series correlates perfectly with itself
scalar correlation_pop_1 = 1

* Identify interpolated parts
* The flag must stay missing where pop is missing: the fill loop below only sets flags
* where interpolated_pop is still missing. The earlier form, 0 & pop != ., evaluated
* to 0 on every row, which turned all those later updates into no-ops.
gen interpolated_pop = 0 if pop != .
replace interpolated_pop = 1 if interpolated_pop_COW_NC == 1 & pop != .

* Identify extrapolated parts
gen extrapolated_pop = 0  if pop != .

* identify polynomial predicted parts
gen polynomialpredicted_pop = 0  if pop != .

/* 1.4.14. Complete by using other data sources, giving primacy to the most extensive datasets */
* Declare the panel so that L. and F. refer to the previous and next year of the same country
xtset cntrycode year
local source = 1
foreach dataset in "Madison2018" "WDI" "Madison" "PWT8" "CLIO" "PWT56" {	

	* Keep track of source #: source for direct values, source2 for chained values
	local source = `source' + 1
	local source2 = 990+`source'
	
	* Compute correlation with baseline series, stored rounded to two decimals
	corr pop_COW_NC pop_`dataset' if year >= $startyear & independence_years != .
	scalar correlation_pop_`source' = round(`r(rho)',.01)

		* Extend forward: apply the year-on-year growth of this source to the previous
		* value of the composite series. Each pass reaches one year further, so repeat
		* until a pass adds nothing to the counted sample.
		local i = 999
		while `i' != 0 {
			sum pop if independence_years != . & year >= $startyear
			local original = r(N)
			replace pop = (1+(pop_`dataset'-L.pop_`dataset')/L.pop_`dataset')*L.pop if pop == .
			sum pop if independence_years != . & year >= $startyear
			local extended = r(N)
			local i = `extended' - `original'
			}
	
		* Extend backward: same idea, anchored on the following year
		local i = 999
		while `i' != 0 {
			sum pop if independence_years != . & year >= $startyear
			local original = r(N)
			replace pop = (1+(pop_`dataset'-F.pop_`dataset')/F.pop_`dataset')*F.pop if pop == .
			sum pop if independence_years != . & year >= $startyear
			local extended = r(N)
			local i = `extended' - `original'
			}
			
		* Identify source of the chained values
		replace source_pop = `source2' if source_pop == . & pop != .
		
		* Identify interpolated parts among the chained values, then close the flag
		* with 0 for the rest, so that a later source cannot flag values it did not supply
		replace interpolated_pop = 1  if pop != . & interpolated_pop_`dataset' == 1 & interpolated_pop == .
		replace interpolated_pop = 0  if pop != . & interpolated_pop == .
					
		* Chained values count as extrapolated
		replace extrapolated_pop = 1  if pop != . & extrapolated_pop == .
				
		* Extend for countries missing in non-overlapping time periods in reference data series:
		* take the level of this source as it is
		replace pop = pop_`dataset' if pop == .
		
		* Identify source of the directly copied values
		replace source_pop = `source' if source_pop == . & pop != .

		* Identify interpolated parts among the directly copied values, then close the flag
		replace interpolated_pop = 1  if pop != . & interpolated_pop_`dataset' == 1 & interpolated_pop == .
		replace interpolated_pop = 0  if pop != . & interpolated_pop == .
		
		* Directly copied values are not extrapolated
		replace extrapolated_pop = 0 if pop != . & extrapolated_pop == .
	}

	
* identify polynomial predicted parts: none in the population series
replace polynomialpredicted_pop = 0  if pop != .

/* 1.4.15. collect data on population density */

* Inhabitants per unit of surface; missing when either input is missing
gen popdens = pop/surface

/* 1.4.16. Compute log population for later usage */

gen lpop = ln(pop)

/* 1.4.17. Compute population shares for later usage */

* For each year, divide every population by that year's total over the rows with a
* non-missing, non-negative independence_years. The share is filled for all rows
* of the year, not only for those that enter the total.
gen popshare = .
foreach yr of numlist $startyear / 2016 {
	sum pop if independence_years >= 0 & independence_years != . & year == `yr'
	replace popshare = pop/`r(sum)' if year == `yr'
	}

/* 1.4.18. Label relevant variables */

label var pop "Absolute number of inhabitants: various sources"
label var lpop "Log population: various sources"
label var popdens "Population density: population/surface area"
* popshare stays in the data after this section and was the only derived variable without a label
label var popshare "Population share: population/total population of independent countries"

/* 1.4.19. Drop raw data */

* pop_* matches the per-source series and their _raw copies;
* pop, lpop, popdens and popshare do not match and are kept
drop pop_*

* Reroute to directory containing dofiles
* Step up one level from the intermediate-results folder, then enter the do-file folder.
* The relative path is written with a backslash separator, like the cd at the start of 1.4.1.
cd ..
cd ".\1. Dofiles"
}
